Load Libraries

library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0     v purrr   0.3.4
## v tibble  3.0.0     v dplyr   0.8.5
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ----------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
data = readRDS("../../data/data.rds")
str(data)
## 'data.frame':    75866 obs. of  22 variables:
##  $ Race         : chr  "10M" "10M" "10M" "10M" ...
##  $ Name         : chr  "Jane Omoro " "Jane Ngotho " "Lidiya Grigoryeva " "Eunice Sagero " ...
##  $ Gender       : chr  "W" "W" "W" "W" ...
##  $ Age          : num  26 29 NA 20 29 24 38 NA 27 30 ...
##  $ Time         : POSIXct, format: "0000-01-01 00:53:37" "0000-01-01 00:53:38" ...
##  $ Pace         : POSIXct, format: "0000-01-01 00:05:22" "0000-01-01 00:05:22" ...
##  $ PiS          : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ TiS          : num  2358 2358 2358 2358 2358 ...
##  $ Division     : chr  "W2529" "W2529" "NR" "W2024" ...
##  $ PiD          : num  1 2 NA 1 3 2 1 NA 4 1 ...
##  $ TiD          : num  559 559 NA 196 559 196 387 NA 559 529 ...
##  $ Hometown     : chr  "Kenya" "Kenya" "Russia" "Kenya" ...
##  $ Home State   : chr  NA NA NA NA ...
##  $ year         : int  1999 1999 1999 1999 1999 1999 1999 1999 1999 1999 ...
##  $ divisionTitle: chr  "Overall+Women" "Overall+Women" "Overall+Women" "Overall+Women" ...
##  $ section      : chr  "10M" "10M" "10M" "10M" ...
##  $ page         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ link         : chr  "http://www.cballtimeresults.org/performances?utf8=%E2%9C%93&section=10M&year=1999&division=Overall+Women&page=1" "http://www.cballtimeresults.org/performances?utf8=%E2%9C%93&section=10M&year=1999&division=Overall+Women&page=1" "http://www.cballtimeresults.org/performances?utf8=%E2%9C%93&section=10M&year=1999&division=Overall+Women&page=1" "http://www.cballtimeresults.org/performances?utf8=%E2%9C%93&section=10M&year=1999&division=Overall+Women&page=1" ...
##  $ DivisionCode : chr  "W" "W" "N" "W" ...
##  $ DivisionNum  : chr  "529" "529" "" "024" ...
##  $ TimeMins     : num  53.6 53.6 53.7 53.9 54.1 ...
##  $ PaceMins     : num  5.37 5.37 5.37 5.4 5.42 ...
summary(data)
##      Race               Name              Gender               Age       
##  Length:75866       Length:75866       Length:75866       Min.   : 7.00  
##  Class :character   Class :character   Class :character   1st Qu.:27.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :32.00  
##                                                           Mean   :33.85  
##                                                           3rd Qu.:39.00  
##                                                           Max.   :87.00  
##                                                           NA's   :20     
##       Time                          Pace                          PiS      
##  Min.   :0000-01-01 00:51:44   Min.   :0000-01-01 00:05:10   Min.   :   1  
##  1st Qu.:0000-01-01 01:28:39   1st Qu.:0000-01-01 00:08:52   1st Qu.:1356  
##  Median :0000-01-01 01:37:29   Median :0000-01-01 00:09:45   Median :2786  
##  Mean   :0000-01-01 01:38:13   Mean   :0000-01-01 00:09:50   Mean   :3305  
##  3rd Qu.:0000-01-01 01:46:58   3rd Qu.:0000-01-01 00:10:42   3rd Qu.:4905  
##  Max.   :0000-01-01 02:57:31   Max.   :0000-01-01 00:17:45   Max.   :9729  
##                                                                            
##       TiS         Division              PiD              TiD      
##  Min.   :2166   Length:75866       Min.   :   1.0   Min.   :   1  
##  1st Qu.:4333   Class :character   1st Qu.: 165.0   1st Qu.: 559  
##  Median :6395   Mode  :character   Median : 404.0   Median : 953  
##  Mean   :6609                      Mean   : 595.6   Mean   :1190  
##  3rd Qu.:8853                      3rd Qu.: 816.0   3rd Qu.:1678  
##  Max.   :9729                      Max.   :5302.0   Max.   :2803  
##                                    NA's   :20       NA's   :20    
##    Hometown          Home State             year      divisionTitle     
##  Length:75866       Length:75866       Min.   :1999   Length:75866      
##  Class :character   Class :character   1st Qu.:2005   Class :character  
##  Mode  :character   Mode  :character   Median :2008   Mode  :character  
##                                        Mean   :2007                     
##                                        3rd Qu.:2010                     
##                                        Max.   :2012                     
##                                                                         
##    section               page           link           DivisionCode      
##  Length:75866       Min.   :  1.0   Length:75866       Length:75866      
##  Class :character   1st Qu.: 68.0   Class :character   Class :character  
##  Mode  :character   Median :140.0   Mode  :character   Mode  :character  
##                     Mean   :165.7                                        
##                     3rd Qu.:246.0                                        
##                     Max.   :487.0                                        
##                                                                          
##  DivisionNum           TimeMins         PaceMins     
##  Length:75866       Min.   : 51.73   Min.   : 5.167  
##  Class :character   1st Qu.: 88.65   1st Qu.: 8.867  
##  Mode  :character   Median : 97.48   Median : 9.750  
##                     Mean   : 98.22   Mean   : 9.823  
##                     3rd Qu.:106.97   3rd Qu.:10.700  
##                     Max.   :177.52   Max.   :17.750  
## 
head(data)
##   Race               Name Gender Age                Time                Pace
## 1  10M        Jane Omoro       W  26 0000-01-01 00:53:37 0000-01-01 00:05:22
## 2  10M       Jane Ngotho       W  29 0000-01-01 00:53:38 0000-01-01 00:05:22
## 3  10M Lidiya Grigoryeva       W  NA 0000-01-01 00:53:40 0000-01-01 00:05:22
## 4  10M     Eunice Sagero       W  20 0000-01-01 00:53:55 0000-01-01 00:05:24
## 5  10M   Alla Zhilyayeva       W  29 0000-01-01 00:54:08 0000-01-01 00:05:25
## 6  10M    Teresa Wanjiku       W  24 0000-01-01 00:54:10 0000-01-01 00:05:25
##   PiS  TiS Division PiD TiD Hometown Home State year divisionTitle section page
## 1   1 2358    W2529   1 559    Kenya       <NA> 1999 Overall+Women     10M    1
## 2   2 2358    W2529   2 559    Kenya       <NA> 1999 Overall+Women     10M    1
## 3   3 2358       NR  NA  NA   Russia       <NA> 1999 Overall+Women     10M    1
## 4   4 2358    W2024   1 196    Kenya       <NA> 1999 Overall+Women     10M    1
## 5   5 2358    W2529   3 559   Russia       <NA> 1999 Overall+Women     10M    1
## 6   6 2358    W2024   2 196    Kenya       <NA> 1999 Overall+Women     10M    1
##                                                                                                              link
## 1 http://www.cballtimeresults.org/performances?utf8=%E2%9C%93&section=10M&year=1999&division=Overall+Women&page=1
## 2 http://www.cballtimeresults.org/performances?utf8=%E2%9C%93&section=10M&year=1999&division=Overall+Women&page=1
## 3 http://www.cballtimeresults.org/performances?utf8=%E2%9C%93&section=10M&year=1999&division=Overall+Women&page=1
## 4 http://www.cballtimeresults.org/performances?utf8=%E2%9C%93&section=10M&year=1999&division=Overall+Women&page=1
## 5 http://www.cballtimeresults.org/performances?utf8=%E2%9C%93&section=10M&year=1999&division=Overall+Women&page=1
## 6 http://www.cballtimeresults.org/performances?utf8=%E2%9C%93&section=10M&year=1999&division=Overall+Women&page=1
##   DivisionCode DivisionNum TimeMins PaceMins
## 1            W         529 53.61667 5.366667
## 2            W         529 53.63333 5.366667
## 3            N             53.66667 5.366667
## 4            W         024 53.91667 5.400000
## 5            W         529 54.13333 5.416667
## 6            W         024 54.16667 5.416667
unique(data$Race)
## [1] "10M"
unique(data$Gender)
## [1] "W"       "xue Zhu" "suzy"    "Cindy"
unique(data %>% arrange(Age) %>%  select(Age) %>%  pluck(1)) 
##  [1]  7 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
## [26] 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58
## [51] 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 80 81 83 84 85
## [76] 86 87 NA
unique(data %>% arrange(Division) %>%  select(Division) %>%  pluck(1)) 
##  [1] "NR"    "W0119" "W2024" "W2529" "W3034" "W3539" "W4044" "W4549" "W5054"
## [10] "W5559" "W6064" "W6569" "W7074" "W7579" "W8099"
  • Some issues with Gender –> Does it matter?

Missing Values

data %>% 
  select_if(function(x) any(is.na(x))) %>% 
  summarise_each(~ sum(is.na(.))) 
##   Age PiD TiD Home State
## 1  20  20  20        241
noresults = data %>% dplyr::filter(Division == "NR")
dim(noresults)
## [1] 19 22
head(noresults)
##   Race                 Name Gender Age                Time                Pace
## 1  10M   Lidiya Grigoryeva       W  NA 0000-01-01 00:53:40 0000-01-01 00:05:22
## 2  10M        Gladys Asiba       W  NA 0000-01-01 00:54:50 0000-01-01 00:05:29
## 3  10M   Connie Buckwalter       W  NA 0000-01-01 00:59:36 0000-01-01 00:05:58
## 4  10M            Ann Reid       W  NA 0000-01-01 01:53:03 0000-01-01 00:11:18
## 5  10M        Loretta Cuce       W  NA 0000-01-01 01:53:38 0000-01-01 00:11:22
## 6  10M Unidentified Runner       W  NA 0000-01-01 01:19:45 0000-01-01 00:07:59
##    PiS  TiS Division PiD TiD   Hometown Home State year divisionTitle section
## 1    3 2358       NR  NA  NA     Russia       <NA> 1999 Overall+Women     10M
## 2    8 2358       NR  NA  NA      Kenya       <NA> 1999 Overall+Women     10M
## 3   17 2358       NR  NA  NA  Lancaster         PA 1999 Overall+Women     10M
## 4 2176 2358       NR  NA  NA   Bethesda         MD 1999 Overall+Women     10M
## 5 2611 2972       NR  NA  NA Alexandria         VA 2001 Overall+Women     10M
## 6  270 3333       NR  NA  NA Washington         DC 2002 Overall+Women     10M
##   page
## 1    1
## 2    1
## 3    1
## 4  109
## 5  131
## 6   14
##                                                                                                                link
## 1   http://www.cballtimeresults.org/performances?utf8=%E2%9C%93&section=10M&year=1999&division=Overall+Women&page=1
## 2   http://www.cballtimeresults.org/performances?utf8=%E2%9C%93&section=10M&year=1999&division=Overall+Women&page=1
## 3   http://www.cballtimeresults.org/performances?utf8=%E2%9C%93&section=10M&year=1999&division=Overall+Women&page=1
## 4 http://www.cballtimeresults.org/performances?utf8=%E2%9C%93&section=10M&year=1999&division=Overall+Women&page=109
## 5 http://www.cballtimeresults.org/performances?utf8=%E2%9C%93&section=10M&year=2001&division=Overall+Women&page=131
## 6  http://www.cballtimeresults.org/performances?utf8=%E2%9C%93&section=10M&year=2002&division=Overall+Women&page=14
##   DivisionCode DivisionNum  TimeMins  PaceMins
## 1            N              53.66667  5.366667
## 2            N              54.83333  5.483333
## 3            N              59.60000  5.966667
## 4            N             113.05000 11.300000
## 5            N             113.63333 11.366667
## 6            N              79.75000  7.983333
noage = data %>% dplyr::filter(is.na(Age))
dim(noage)
## [1] 20 22
head(noage)
##   Race                 Name Gender Age                Time                Pace
## 1  10M   Lidiya Grigoryeva       W  NA 0000-01-01 00:53:40 0000-01-01 00:05:22
## 2  10M        Gladys Asiba       W  NA 0000-01-01 00:54:50 0000-01-01 00:05:29
## 3  10M   Connie Buckwalter       W  NA 0000-01-01 00:59:36 0000-01-01 00:05:58
## 4  10M            Ann Reid       W  NA 0000-01-01 01:53:03 0000-01-01 00:11:18
## 5  10M        Loretta Cuce       W  NA 0000-01-01 01:53:38 0000-01-01 00:11:22
## 6  10M Unidentified Runner       W  NA 0000-01-01 01:19:45 0000-01-01 00:07:59
##    PiS  TiS Division PiD TiD   Hometown Home State year divisionTitle section
## 1    3 2358       NR  NA  NA     Russia       <NA> 1999 Overall+Women     10M
## 2    8 2358       NR  NA  NA      Kenya       <NA> 1999 Overall+Women     10M
## 3   17 2358       NR  NA  NA  Lancaster         PA 1999 Overall+Women     10M
## 4 2176 2358       NR  NA  NA   Bethesda         MD 1999 Overall+Women     10M
## 5 2611 2972       NR  NA  NA Alexandria         VA 2001 Overall+Women     10M
## 6  270 3333       NR  NA  NA Washington         DC 2002 Overall+Women     10M
##   page
## 1    1
## 2    1
## 3    1
## 4  109
## 5  131
## 6   14
##                                                                                                                link
## 1   http://www.cballtimeresults.org/performances?utf8=%E2%9C%93&section=10M&year=1999&division=Overall+Women&page=1
## 2   http://www.cballtimeresults.org/performances?utf8=%E2%9C%93&section=10M&year=1999&division=Overall+Women&page=1
## 3   http://www.cballtimeresults.org/performances?utf8=%E2%9C%93&section=10M&year=1999&division=Overall+Women&page=1
## 4 http://www.cballtimeresults.org/performances?utf8=%E2%9C%93&section=10M&year=1999&division=Overall+Women&page=109
## 5 http://www.cballtimeresults.org/performances?utf8=%E2%9C%93&section=10M&year=2001&division=Overall+Women&page=131
## 6  http://www.cballtimeresults.org/performances?utf8=%E2%9C%93&section=10M&year=2002&division=Overall+Women&page=14
##   DivisionCode DivisionNum  TimeMins  PaceMins
## 1            N              53.66667  5.366667
## 2            N              54.83333  5.483333
## 3            N              59.60000  5.966667
## 4            N             113.05000 11.300000
## 5            N             113.63333 11.366667
## 6            N              79.75000  7.983333
setdiff(noage, noresults)
##   Race             Name Gender Age                Time                Pace  PiS
## 1  10M Michelle Hinman       W  NA 0000-01-01 01:39:13 0000-01-01 00:09:55 2455
##    TiS Division PiD TiD Hometown Home State year divisionTitle section page
## 1 4333    W8099   1   2       NR       <NA> 2005 Overall+Women     10M  123
##                                                                                                                link
## 1 http://www.cballtimeresults.org/performances?utf8=%E2%9C%93&section=10M&year=2005&division=Overall+Women&page=123
##   DivisionCode DivisionNum TimeMins PaceMins
## 1            W         099 99.21667 9.916667
  • This seems to be an issue. It looks like this person is placed first in the division, but age is not set so she has been classified as W8099 and the time seems to be off from what would be expected from this age bracket.
  • TODO: Fix this
# Remove no results
data = data %>% 
  dplyr::filter(Division != "NR")

Participants by Year

plotdata = data %>% 
  group_by(year) %>% 
  summarise(count=n()) 

p = plotdata %>% 
  ggplot(aes(x=year, y=count)) +
  geom_line() +
  geom_point()
ggplotly(p, tooltip="text")
p = plotdata %>% 
  ggplot(aes(x = year, y = count)) + 
  geom_bar(stat = "identity")
ggplotly(p, tooltip="text")
# Alternate colored by Division
plotdata = data %>% 
  group_by(year, Division) %>% 
  summarise(count = n())

p = plotdata %>% 
  ggplot(aes(x = year, y = count, fill = Division)) + 
  geom_bar(stat = "identity", position = "stack") 
ggplotly(p, tooltip="text")

Age Brackets by Year

plotdata_by_year = data %>% 
  group_by(year) %>% 
  summarise(count_year = n())

plotdata_by_year_div = data %>% 
  group_by(year, Division) %>% 
  summarise(count_year_div = n())

plotdata = plotdata_by_year %>% 
  plyr::join(plotdata_by_year_div, by = "year", type = "full") %>% 
  mutate(percent = round(count_year_div/count_year*100,1))
  
plotdata %>% 
  ggplot(aes(x = year, y = percent, fill = Division)) + 
  geom_bar(stat = "identity", position = "stack")

Race Times

p = data %>% 
  mutate_at("year", as.factor) %>% 
  ggplot(aes(x=year, y=TimeMins)) + 
  geom_boxplot()

ggplotly(p, tooltip="text")
p = data %>% 
  dplyr::filter(!(Division %in% c("W8099", "W7579"))) %>% 
  mutate_at("year", as.factor) %>% 
  ggplot(aes(x=year, y=TimeMins, fill=year)) + 
  geom_boxplot() +
  facet_wrap(. ~ Division, ncol=4) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + 
  coord_flip()

p

#ggplotly(p, tooltip="text")
p = data %>% 
  dplyr::filter(Division %in% c("W2529", "W3034", "W3539", "W4044")) %>% 
  mutate_at("year", as.factor) %>% 
  ggplot(aes(x=year, y=TimeMins, fill=year)) + 
  geom_boxplot() +
  facet_wrap(. ~ Division, ncol=4) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + 
  coord_flip()

m <- list(
  l = 100,
  r = 50,
  b = 100,
  t = 50,
  pad = 4
)
gp = ggplotly(p, tooltip="text", width=800, height=600) %>%
  layout(
    autosize = F,
    margin=m,
    legend = list(orientation = "h", xanchor = "center", x = 0.5, y = -0.15)
    ) %>% 
  style(legendgroup = NULL)
# gp

# find the annotation you want to move
# Based on https://stackoverflow.com/questions/42763280/r-ggplot-and-plotly-axis-margin-wont-change
labels = c("year", "TimeMins")
for(i in seq_along(gp[['x']][['layout']][['annotations']])){
  for(label in labels){
    if (gp[['x']][['layout']][['annotations']][[i]]$text == label){
      print(paste(
        label, "Index: ", i, "X, Y: ",
        gp[['x']][['layout']][['annotations']][[i]]$x,
        gp[['x']][['layout']][['annotations']][[i]]$y)
      )
    }
  }
}
## [1] "TimeMins Index:  1 X, Y:  0.5 -0.0356164383561644"
## [1] "year Index:  2 X, Y:  -0.0321917808219178 0.5"
## [1] "year Index:  7 X, Y:  1.02 1"
# X Label
gp[['x']][['layout']][['annotations']][[1]]$y = -0.1
# Y Label
gp[['x']][['layout']][['annotations']][[2]]$x = -0.075
# Legend XY
gp[['x']][['layout']][['annotations']][[7]]$x = -0.025
gp[['x']][['layout']][['annotations']][[7]]$y = -0.225
gp